/*
* Copyright (C) 2012, The Linux Foundation. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*     * Redistributions of source code must retain the above copyright
*       notice, this list of conditions and the following disclaimer.
*     * Redistributions in binary form must reproduce the above
*       copyright notice, this list of conditions and the following
*       disclaimer in the documentation and/or other materials provided
*       with the distribution.
*     * Neither the name of The Linux Foundation nor the names of its
*       contributors may be used to endorse or promote products derived
*       from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
* ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

@ NEON optimized assembly routine of kf_bfly4()
@
@ Radix-4 butterfly over m groups of four complex float32 values held as
@ interleaved (re, im) pairs.  For every k in [0, m) the routine combines
@ Fout[k], Fout[k+m], Fout[k+2m] and Fout[k+3m] in place, multiplying the last
@ three by twiddles read with strides fstride, 2*fstride and 3*fstride.
@
@ Syntax: GNU as, ARM (A32) instruction set, '@' comments.
@
@ In:    r0 = Fout     (pointer to the complex buffer, updated in place)
@        r1 = fstride  (twiddle stride, in complex elements)
@        r2 = st       (state pointer; see the KF_STATE_* offsets below)
@        r3 = m        (number of butterflies; m <= 0 returns immediately)
@ Clobb: r0-r3, d0-d7 (q0-q3), d16-d31 (q8-q15), flags.
@        r4-r12 are saved and restored.  d8-d15 are never written, which is
@        why the vstmdb/vldmia pair below can stay disabled.
@
@ Register roles inside the loops:
@        r1  = fstride*8     r9  = fstride*16    r10 = fstride*24   (bytes)
@        r3  = m*8 (bytes)   r2  = st->inverse
@        r6  = tw1           r7  = tw2           r8  = tw3
@        r4  = 4-wide loop counter (m/4)
@        r5  = m, later the single-sample loop counter (m%4)
@        r11 = load cursor, r12 = store cursor over Fout+m, +2m, +3m

    .text
    .arm                                        @ body is written for the ARM (A32) encoding
    .fpu neon
    .align 4
    .global     kf_bfly4
    .type       kf_bfly4, %function             @ mark as code so the linker can interwork
    .func       kf_bfly4

@ Byte offsets into the state structure.  They reproduce the constants the
@ routine has always used and MUST match the C definition of the structure,
@ which is not visible from this file (TODO: confirm against the C header).
    .equ        KF_STATE_INVERSE_OFS,  4        @ st->inverse
    .equ        KF_STATE_TWIDDLES_OFS, 264      @ st->twiddles

kf_bfly4:
    cmp             r3, #0                      @ nothing to do for m <= 0; without this guard the
    bxle            lr                          @ loop counters below would wrap and overrun Fout
    stmdb           sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
@   vstmdb          sp!, {d8-d15}               @ not needed: d8-d15 are never written
                                                @ r0 - Fout| r1 - fstride | r2 - st | r3 - m
    pld             [r0, #0]
    mov             r5, r3                      @ keep m (element count) for the loop counters
    mov             r3, r3, asl #3              @ convert m into bytes count (m*8)
    add             r6, r2, #KF_STATE_TWIDDLES_OFS @ tw1 = st->twiddles
    pld             [r6, #0]
    mov             r7, r6                      @ tw2 = st->twiddles
    mov             r8, r7                      @ tw3 = st->twiddles
    ldr             r2, [r2, #KF_STATE_INVERSE_OFS] @ st->inverse
    mov             r1, r1, asl #3              @ convert fstride into bytes count (fstride*8)
    mov             r9, r1, asl #1              @ fstride*2
    add             r10, r1, r9                 @ fstride*3
                                                @ float32x4x2_t rfout;       q0, q1 (d0-d3)
                                                @ float32x4x2_t tmp;         q2, q3 (d4-d7)
                                                @ float32x4x2_t scratch0;    q12, q13 (d24-d27)
                                                @ float32x4x2_t scratch1;    q14, q15 (d28-d31)
                                                @ float32x4x2_t scratch2;    q8, q9 (d16-d19)
                                                @ float32x4x2_t scratch3;    q10, q11 (d20-d23)
    asrs            r4, r5, #2                  @ size_t k=m/4;
    beq             .kf_bfly4_do_while1         @ if(k==0) only singles remain (r5 == m, 1..3)

.kf_bfly4_do_while4:                            @ do { //process 4 samples per iteration
    add             r11, r0, r3                 @ fom = Fout+m;
    mov             r12, r11                    @ second cursor, used for the stores below
    pld             [r7, #0]
    vld1.32         {d20}, [r6], r1             @ rtwd1 = vld1_f32((const float32_t*)tw1); tw1 += fstride;
    vld1.32         {d21}, [r6], r1             @ rtwd2 = vld1_f32((const float32_t*)tw1); tw1 += fstride;
    vld1.32         {d22}, [r6], r1             @ rtwd3 = vld1_f32((const float32_t*)tw1); tw1 += fstride;
    vld1.32         {d23}, [r6], r1             @ rtwd4 = vld1_f32((const float32_t*)tw1); tw1 += fstride;
    vuzp.32         q10, q11                    @ scratch3 = vuzpq_f32(...): q10 = real parts, q11 = imag parts
    vld2.32         {d0-d3}, [r11], r3          @ rfout = vld2q_f32((const float32_t*)(fom1)); fom2 = Fout+m2;
    vmul.f32        q2, q0, q10                 @ C_MUL_NEON(scratch0, rfout, scratch3);
    vmul.f32        q3, q1, q11
    vsub.f32        q12, q2, q3                 @   re = a.re*b.re - a.im*b.im
    vmul.f32        q2, q0, q11
    vmul.f32        q3, q1, q10
    vadd.f32        q13, q2, q3                 @   im = a.re*b.im + a.im*b.re

    pld             [r8, #0]
    vld1.32         {d20}, [r7], r9             @ rtwd1 = vld1_f32((const float32_t*)tw2); tw2 += fstride*2;
    vld1.32         {d21}, [r7], r9             @ rtwd2 = vld1_f32((const float32_t*)tw2); tw2 += fstride*2;
    vld1.32         {d22}, [r7], r9             @ rtwd3 = vld1_f32((const float32_t*)tw2); tw2 += fstride*2;
    vld1.32         {d23}, [r7], r9             @ rtwd4 = vld1_f32((const float32_t*)tw2); tw2 += fstride*2;
    vuzp.32         q10, q11                    @ scratch3 = vuzpq_f32(vcombine_f32(rtwd1, rtwd2), vcombine_f32(rtwd3, rtwd4));
    vld2.32         {d0-d3}, [r11], r3          @ rfout = vld2q_f32((const float32_t*)(fom2)); fom3 = Fout+m3;
    vmul.f32        q2, q0, q10                 @ C_MUL_NEON(scratch1, rfout, scratch3);
    vmul.f32        q3, q1, q11
    vsub.f32        q14, q2, q3
    vmul.f32        q2, q0, q11
    vmul.f32        q3, q1, q10
    vadd.f32        q15, q2, q3

    pld             [r0, #0]
    vld1.32         {d20}, [r8], r10            @ rtwd1 = vld1_f32((const float32_t*)tw3); tw3 += fstride*3;
    vld1.32         {d21}, [r8], r10            @ rtwd2 = vld1_f32((const float32_t*)tw3); tw3 += fstride*3;
    vld1.32         {d22}, [r8], r10            @ rtwd3 = vld1_f32((const float32_t*)tw3); tw3 += fstride*3;
    vld1.32         {d23}, [r8], r10            @ rtwd4 = vld1_f32((const float32_t*)tw3); tw3 += fstride*3;
    vuzp.32         q10, q11                    @ scratch3 = vuzpq_f32(vcombine_f32(rtwd1, rtwd2), vcombine_f32(rtwd3, rtwd4));
    vld2.32         {d0-d3}, [r11]              @ rfout = vld2q_f32((const float32_t*)(fom3));
    vmul.f32        q2, q0, q10                 @ C_MUL_NEON(scratch2, rfout, scratch3);
    vmul.f32        q3, q1, q11
    vsub.f32        q8, q2, q3
    vmul.f32        q2, q0, q11
    vmul.f32        q3, q1, q10
    vadd.f32        q9, q2, q3

    vld2.32         {d0-d3}, [r0]               @ rfout = vld2q_f32((const float32_t*)(Fout));
    vsub.f32        q2, q0, q14                 @ C_SUB_NEON(tmp, rfout, scratch1 );
    vsub.f32        q3, q1, q15

    vadd.f32        q0, q0, q14                 @ C_ADD_NEON(rfout, rfout, scratch1);
    vadd.f32        q1, q1, q15

    vadd.f32        q10, q12, q8                @ C_ADD_NEON(scratch3, scratch0, scratch2);
    vadd.f32        q11, q13, q9

    vsub.f32        q12, q12, q8                @ C_SUB_NEON(scratch0, scratch0, scratch2);
    vsub.f32        q13, q13, q9

    vsub.f32        q8, q0, q10                 @ C_SUB_NEON(scratch2, rfout, scratch3);
    vsub.f32        q9, q1, q11

    vadd.f32        q0, q0, q10                 @ C_ADD_NEON(rfout, rfout, scratch3);
    vadd.f32        q1, q1, q11
    vst2.32         {d0-d3}, [r0]!              @ vst2q_f32((float32_t*)Fout, rfout); Fout+=4;

    cmp             r2, #0
    beq             .not_inverse4               @ if(st->inverse) {
    vsub.f32        q10, q2, q13                @ scratch3.val[0] = vsubq_f32(tmp.val[0], scratch0.val[1]);
    vadd.f32        q11, q3, q12                @ scratch3.val[1] = vaddq_f32(tmp.val[1], scratch0.val[0]);
    vadd.f32        q14, q2, q13                @ scratch1.val[0] = vaddq_f32(tmp.val[0], scratch0.val[1]);
    vsub.f32        q15, q3, q12                @ scratch1.val[1] = vsubq_f32(tmp.val[1], scratch0.val[0]);
    b               .c_end4
.not_inverse4:                                  @ } else {
    vadd.f32        q10, q2, q13                @ scratch3.val[0] = vaddq_f32(tmp.val[0], scratch0.val[1]);
    vsub.f32        q11, q3, q12                @ scratch3.val[1] = vsubq_f32(tmp.val[1], scratch0.val[0]);
    vsub.f32        q14, q2, q13                @ scratch1.val[0] = vsubq_f32(tmp.val[0], scratch0.val[1]);
    vadd.f32        q15, q3, q12                @ scratch1.val[1] = vaddq_f32(tmp.val[1], scratch0.val[0]);
                                                @ }
.c_end4:
    vst2.32         {d20-d23}, [r12], r3        @ vst2q_f32((float32_t*)(fom), scratch3); fom2 = Fout+m2;
    vst2.32         {d16-d19}, [r12], r3        @ vst2q_f32((float32_t*)fom2, scratch2); fom3 = Fout+m3;
    vst2.32         {d28-d31}, [r12]            @ vst2q_f32((float32_t*)(fom3), scratch1);

    pld             [r6, #0]

    subs            r4, r4, #1                  @ }while(--k);
    bne             .kf_bfly4_do_while4

@.kf_bfly4_process_singles:
    ands            r5, r5, #3                  @ k = m%4 (m > 0 here, so a plain mask is exact)
    beq             .kf_bfly4_done              @ if (k == 0) nothing is left over

.kf_bfly4_do_while1:                            @ do { //process 1 sample per iteration
                                                @ Only lane 0 of each D register is meaningful in this
                                                @ loop; lane 1 carries stale data that is never stored.
    pld             [r7, #0]
    vld1.32         {d18}, [r6], r1             @ rtwd1 = vld1_f32((const float32_t*)tw1); tw1 += fstride;
    vuzp.32         d18, d19                    @ d18[0] = tw.re, d19[0] = tw.im (d19 input is don't-care)
    add             r12, r0, r3                 @ fom = Fout+m;
    vld1.32         {d0}, [r12], r3             @ rfout = vld1_f32((const float32_t*)(fom1)); fom2 = Fout+m2;
    vuzp.32         d0, d1                      @ d0[0] = re, d1[0] = im (d1 input is don't-care)
    vmul.f32        q1, q0, q9                  @ C_MUL_NEON(scratch0, rfout, scratch3);
    vsub.f32        d4, d2, d3
    vmul.f32        d2, d0, d19
    vmul.f32        d3, d1, d18
    vadd.f32        d5, d2, d3

    pld             [r8, #0]
    vld1.32         {d18}, [r7], r9             @ rtwd1 = vld1_f32((const float32_t*)tw2); tw2+= fstride*2;
    vuzp.32         d18, d19                    @ split twiddle into re (d18[0]) / im (d19[0])
    vld1.32         {d0}, [r12], r3             @ rfout = vld1_f32((const float32_t*)(fom2)); fom3 = Fout+m3;
    vuzp.32         d0, d1                      @ split sample into re (d0[0]) / im (d1[0])
    vmul.f32        q1, q0, q9                  @ C_MUL_NEON(scratch1, rfout, scratch3);
    vsub.f32        d6, d2, d3
    vmul.f32        d2, d0, d19
    vmul.f32        d3, d1, d18
    vadd.f32        d7, d2, d3

    pld             [r0, #0]
    vld1.32         {d18}, [r8], r10            @ rtwd1 = vld1_f32((const float32_t*)tw3); tw3 += fstride*3;
    vuzp.32         d18, d19                    @ split twiddle into re (d18[0]) / im (d19[0])
    vld1.32         {d0}, [r12]                 @ rfout = vld1_f32((const float32_t*)(fom3));
    vuzp.32         d0, d1                      @ split sample into re (d0[0]) / im (d1[0])
    vmul.f32        q1, q0, q9                  @ C_MUL_NEON(scratch2, rfout, scratch3);
    vsub.f32        d16, d2, d3
    vmul.f32        d2, d0, d19
    vmul.f32        d3, d1, d18
    vadd.f32        d17, d2, d3

    vld1.32         {d0}, [r0]                  @ rfout = vld1_f32((const float32_t*)(Fout));
    vuzp.32         d0, d1
    vsub.f32        q1, q0, q3                  @ C_SUB_NEON(tmp, rfout, scratch1 );

    vadd.f32        q0, q0, q3                  @ C_ADD_NEON(rfout, rfout, scratch1);

    vadd.f32        q9, q2, q8                  @ C_ADD_NEON(scratch3, scratch0, scratch2);

    vsub.f32        q2, q2, q8                  @ C_SUB_NEON(scratch0, scratch0, scratch2);

    vsub.f32        q8, q0, q9                  @ C_SUB_NEON(scratch2, rfout, scratch3);

    vadd.f32        q0, q0, q9                  @ C_ADD_NEON(rfout, rfout, scratch3);

    cmp             r2, #0
    beq             .not_inverse1               @ if(st->inverse) {
    vsub.f32        d18, d2, d5                 @ scratch3.val[0] = vsub_f32(tmp.val[0], scratch0.val[1]);
    vadd.f32        d19, d3, d4                 @ scratch3.val[1] = vadd_f32(tmp.val[1], scratch0.val[0]);
    vadd.f32        d6, d2, d5                  @ scratch1.val[0] = vadd_f32(tmp.val[0], scratch0.val[1]);
    vsub.f32        d7, d3, d4                  @ scratch1.val[1] = vsub_f32(tmp.val[1], scratch0.val[0]);
    b               .c_end1
.not_inverse1:                                  @ } else {
    vadd.f32        d18, d2, d5                 @ scratch3.val[0] = vadd_f32(tmp.val[0], scratch0.val[1]);
    vsub.f32        d19, d3, d4                 @ scratch3.val[1] = vsub_f32(tmp.val[1], scratch0.val[0]);
    vsub.f32        d6, d2, d5                  @ scratch1.val[0] = vsub_f32(tmp.val[0], scratch0.val[1]);
    vadd.f32        d7, d3, d4                  @ scratch1.val[1] = vadd_f32(tmp.val[1], scratch0.val[0]);
                                                @ }
.c_end1:
    mov             r12, r0
    vzip.32         d0, d1                      @ re-interleave: d0 = (re, im)
    vst1.32         {d0}, [r12], r3             @ vst1_f32((float32_t*)Fout, rfout); fom = Fout+m;

    vzip.32         d18, d19
    vst1.32         {d18}, [r12], r3            @ vst1_f32((float32_t*)(fom), scratch3); fom2 = Fout+m2;

    vzip.32         d16, d17
    vst1.32         {d16}, [r12], r3            @ vst1_f32((float32_t*)fom2, scratch2);  fom3 = Fout+m3;

    vzip.32         d6, d7
    vst1.32         {d6}, [r12]                 @ vst1_f32((float32_t*)(fom3), scratch1);

    add             r0, r0, #8                  @ Fout+=1;
    pld             [r6, #0]

    subs            r5, r5, #1                  @ }while(--k);
    bne             .kf_bfly4_do_while1

.kf_bfly4_done:
@   vldmia          sp!, {d8-d15}
    ldmia           sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
    nop

    .size       kf_bfly4, . - kf_bfly4
    .endfunc
    .end

